package com.kylecorry.trail_sense.shared.text.nlp.processors

/**
 * Token processor that drops common English stop words and strips a trailing possessive `'s`
 * from the tokens it keeps.
 *
 * @param additionalStopWords extra words to remove on top of the built-in English list.
 * They are compared exactly (case-sensitive) against each token.
 */
class EnglishStopWordRemover(private val additionalStopWords: Set<String> = emptySet()) : TokenProcessor {

    /**
     * Removes stop words and strips a trailing possessive `'s` from the remaining words.
     *
     * Assumes that the strings have been run through a contraction splitter and are all lowercase
     * (the lookup is case-sensitive and the built-in list is lowercase).
     *
     * A word is removed when either the word itself or the word without its `'s` suffix is a
     * stop word, so e.g. "it's" does not leak through as "it".
     *
     * @param words the tokens to clean
     * @return the kept tokens, in their original order, without the possessive suffix
     */
    fun clean(words: List<String>): List<String> {
        return words.mapNotNull { word ->
            val base = word.removeSuffix(POSSESSIVE_SUFFIX)
            // Check the raw token too, so callers that registered a stop word which itself
            // ends in 's still have it removed.
            if (isStopWord(word) || isStopWord(base)) null else base
        }
    }

    override fun process(tokens: List<String>): List<String> {
        return clean(tokens)
    }

    /** Returns true if [word] is in the built-in list or in [additionalStopWords]. */
    private fun isStopWord(word: String): Boolean {
        return word in STOP_WORDS || word in additionalStopWords
    }

    private companion object {
        private const val POSSESSIVE_SUFFIX = "'s"

        // Shared by all instances so the set is only built once.
        private val STOP_WORDS: Set<String> = setOf(
            // Pronouns
            "i", "me", "my", "myself",
            "we", "our", "ours", "ourselves",
            "you", "your", "yours", "yourself", "yourselves",
            "he", "him", "his", "himself",
            "she", "her", "hers", "herself",
            "it", "its", "itself",
            "they", "them", "their", "theirs", "themselves",
            // Interrogatives and demonstratives
            "what", "which", "who", "whom",
            "this", "that", "these", "those",
            // Auxiliary verbs
            "am", "is", "are", "was", "were", "be", "been", "being",
            "have", "has", "had", "having",
            "do", "does", "did", "doing",
            // Articles and conjunctions
            "a", "an", "the",
            "and", "but", "if", "or", "because", "as", "until", "while",
            // Prepositions
            "of", "at", "by", "for", "with", "about", "against", "between", "into", "through",
            "during", "before", "after", "above", "below", "to", "from", "up", "down", "in",
            "out", "on", "off", "over", "under", "onto",
            // Adverbs
            "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
            // Quantifiers and modifiers
            "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
            "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "much",
            // Modals and other common words
            "can", "could", "will", "just", "should", "now", "might", "must", "shall",
            "need", "would", "went", "like",
            // Some common contractions without the apostrophe
            // ("its" and "were" above already cover "it's" and "we're")
            "im", "ive", "youre", "hes", "shes", "theyre",
            "dont", "doesnt", "didnt", "wasnt", "werent", "arent", "wont", "cant",
            "couldnt", "shouldnt", "wouldnt", "isnt", "hasnt", "havent", "hadnt",
            "shouldve", "couldve", "wouldve", "mightve", "mustve", "shallve",
            "wontve", "cantve", "aint", "hows",
        )
    }
}